library(Seurat)
library(Matrix)
library(ggplot2)
library(cowplot)
library(rafalib)
library(clustree)
library(venn)
library(dplyr)
if(!require(DoubletFinder)){
remotes::install_github("chris-mcginnis-ucsf/DoubletFinder", upgrade = F)
library(DoubletFinder)
}
# The cell cycle gene lists were generated as in the commented chunk. To avoid changes related to future versions of Seurat or BioMart, the lists have been hardcoded below, as generated June 2021.
# convertHumanGeneList <- function(x){
# require("biomaRt")
# human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
# mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
#
# genesV2 = getLDS(attributes = c("hgnc_symbol"), filters = "hgnc_symbol", values = x , mart = human, attributesL = c("mgi_symbol"), martL = mouse, uniqueRows=T)
#
# humanx <- unique(genesV2[, 2])
#
# # Print the first 6 genes found to the screen
# #print(head(humanx))
# return(humanx)
# }
# g2mFeat = convertHumanGeneList(cc.genes$g2m.genes)
# sFeat = convertHumanGeneList(cc.genes$s.genes)
g2mFeat = c("Smc4","Dlgap5","Ctcf","Cdc25c","Gtse1","Kif20b","Ttk","Tacc3","Top2a","Tpx2","Cdca3","Ncapd2",
"Cks2","Anp32e","G2e3","Lbr","Cdca2","Cks1brt","Ckap2","Hmgb2","Kif11","Nek2","Cenpe","Hjurp",
"Ect2","Aurkb","Cks1b","Kif23","Nuf2","Hmmr","Cdca8","Psrc1","Anln","Cdc20","Birc5","Ndc80",
"Rangap1","Ckap5","Kif2c","Cenpf","Nusap1","Cenpa","Aurka","Ube2c","Ckap2l","Mki67","Tubb4b","Bub1",
"Ccnb2")
sFeat = c("Msh2","Exo1","Mcm4","Rrm2","Mcm2","Chaf1b","Gmnn","Cdc45","Slbp","Ubr7","Cdc6",
"Rad51ap1","Rpa2","Hells","Fen1","Gins2","Uhrf1","Mcm6","Ung","Dscc1","Usp1","Clspn",
"Cdca7","Pola1","Nasp","Dtl","Mcm5","Wdr76","Prim1","Casp8ap2","Tipin","Blm","Rrm1",
"Brip1","Rad51","Tyms","E2f8","Rfc2","Ccne2","Pcna")
Raw sequences were processed in CellRanger v 6.0.2. Here, filtered h5-files as generated by CellRanger are combined into a Seurat object.
if(!file.exists("analysis/original_data.rds")){
DMSO = Seurat::Read10X_h5(filename = "raw_data/G_DMSO/filtered_feature_bc_matrix.h5", use.names = T)
GW = Seurat::Read10X_h5(filename = "raw_data/G_GW/filtered_feature_bc_matrix.h5", use.names = T)
sdata.DMSO <- CreateSeuratObject(DMSO, project = "DMSO")
sdata.GW <- CreateSeuratObject(GW, project = "GW")
alldata <- merge(sdata.DMSO, sdata.GW, add.cell.ids = c("DMSO","GW"))
rm(DMSO, GW, sdata.GW, sdata.DMSO)
gc(verbose= FALSE)
saveRDS(alldata, "analysis/original_data.rds")
}else{
alldata = readRDS("analysis/original_data.rds")
}
Cell counts:
| Var1 | Freq |
|---|---|
| DMSO | 14950 |
| GW | 13487 |
Number of features and counts and percentage of mitochondrial and ribosomal RNA are plotted below. Red lines mark limits for filtering.
# Calculate % of mitochondrial and ribosomal genes per cell
alldata <- PercentageFeatureSet(alldata, "^mt-", col.name = "percent_mito")
alldata <- PercentageFeatureSet(alldata, "^Rp[sl]", col.name = "percent_ribo")
# Visualize QC variables with cutoffs
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo")
glist = VlnPlot(alldata, group.by = "orig.ident", features = feats, pt.size = 0.1, combine = FALSE)
names(glist) = feats
for(i in feats){
glist[[i]] = glist[[i]] +
theme(legend.position = "none")
}
cowplot::plot_grid(align = "v",plotlist = list(glist[["nFeature_RNA"]] + geom_hline(yintercept = 500, color = "red"),
glist[["nCount_RNA"]],
glist[["percent_mito"]] + geom_hline(yintercept = 10, color = "red") ,
glist[["percent_ribo"]] + geom_hline(yintercept = 5, color = "red")))
The same QC as above are plotted below, only including cells and genes that pass filtering limits (cell limits: > 500 features & > 5% ribosomal RNA & < 10% mitochondrial RNA; gene limit: > 3 copies expressed). The top expressed genes are also shown. Mitochondrial genes are filtered out of the data.
# subset the object to only keep cells with > 500 features and genes expressed in > 3 copies
selected_c <- WhichCells(alldata, expression = nFeature_RNA > 500)
selected_f <- rownames(alldata)[Matrix::rowSums(alldata) > 3]
data.filt <- subset(alldata, features = selected_f, cells = selected_c)
selected_mito <- WhichCells(data.filt, expression = percent_mito < 10)
selected_ribo <- WhichCells(data.filt, expression = percent_ribo > 5)
# subset the object to only keep cells with > 5% ribosomal RNA and <10% mitochondrial RNA
data.filt <- subset(data.filt, cells = selected_mito)
data.filt <- subset(data.filt, cells = selected_ribo)
# Visualize QC variables in filtered data
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo")
VlnPlot(data.filt, group.by = "orig.ident", features = feats, pt.size = 0.1, ncol = 2) +
NoLegend()
# Identify most highly expressed genes (% of counts)
par(mar = c(4, 8, 2, 1))
C <- data.filt@assays$RNA@counts
C <- Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
most_expressed <- order(apply(C, 1, median), decreasing = T)[20:1]
boxplot(as.matrix(t(C[most_expressed, ])), cex = 0.1, las = 1, xlab = "% total count per cell",
col = (scales::hue_pal())(20)[20:1], horizontal = TRUE)
# Filter Mitocondrial genes
data.filt <- data.filt[!grepl("^mt-", rownames(data.filt)), ]
The S-phase and G2M-phase scores are calculated based on the cell cycle gene lists defined in the Seurat package (translated to mouse genes using biomaRt).
# Before running CellCycleScoring the data need to be normalized and
# logtransformed.
data.filt = NormalizeData(data.filt)
data.filt <- CellCycleScoring(object = data.filt, g2m.features = g2mFeat,
s.features = sFeat)
VlnPlot(data.filt, features = c("S.Score", "G2M.Score"), group.by = "orig.ident",
ncol = 3, pt.size = 0.1)
Doublets are identified and excluded using the DoubletFinder package.
data.filt = FindVariableFeatures(data.filt, verbose = F)
data.filt = ScaleData(data.filt, vars.to.regress = c("nFeature_RNA", "percent_mito"),
verbose = F)
data.filt = RunPCA(data.filt, verbose = F, npcs = 20)
data.filt = RunUMAP(data.filt, dims = 1:10, verbose = F)
nExp <- round(ncol(data.filt) * 0.08) # expect 8% doublets based on 10,000 cells
data.filt <- doubletFinder_v3(data.filt, pN = 0.26, pK = 0.20, nExp = nExp, PCs = 1:10)
DF.name = colnames(data.filt@meta.data)[grepl("DF.classification", colnames(data.filt@meta.data))]
cowplot::plot_grid(ncol = 2, DimPlot(data.filt, group.by = "orig.ident") + NoAxes(),
DimPlot(data.filt, group.by = DF.name) + NoAxes())
VlnPlot(data.filt, features = "nFeature_RNA", group.by = DF.name, pt.size = 0.1)
data.filt = data.filt[, data.filt@meta.data[, DF.name] == "Singlet"]
## [1] "Creating 9239 artificial doublets..."
## [1] "Creating Seurat object..."
## [1] "Normalizing Seurat object..."
## [1] "Finding variable genes..."
## [1] "Scaling data..."
## [1] "Running PCA..."
## [1] "Calculating PC distance matrix..."
## [1] "Computing pANN..."
## [1] "Classifying doublets.."
Final cell counts:
| Var1 | Freq |
|---|---|
| DMSO | 12526 |
| GW | 11665 |
The filtered data is saved under analysis/filtered_data.rds
saveRDS(data.filt, file = "analysis/filtered_data.rds")
rm(list=ls())
The most highly variable genes in the dataset are plotted below. The top 2000 genes are used for the following dimensionality reduction strategies.
alldata = readRDS("analysis/filtered_data.rds")
suppressWarnings(suppressMessages(alldata <- FindVariableFeatures(alldata, selection.method = "vst",
nfeatures = 2000, verbose = FALSE, assay = "RNA")))
top20 <- head(VariableFeatures(alldata), 20)
LabelPoints(plot = VariableFeaturePlot(alldata), points = top20, repel = TRUE)
# Z-score transformation
alldata <- ScaleData(alldata, vars.to.regress = c("percent_mito", "nFeature_RNA"),
assay = "RNA")
PCA is performed for 50 PCs.
alldata <- RunPCA(alldata, npcs = 50, verbose = F)
plot_grid(ncol = 3, DimPlot(alldata, reduction = "pca", group.by = "orig.ident", dims = 1:2) + NoLegend(),
DimPlot(alldata, reduction = "pca", group.by = "orig.ident", dims = 3:4) + NoLegend(),
DimPlot(alldata, reduction = "pca", group.by = "orig.ident", dims = 5:6) +
theme(legend.position = "bottom"))
VizDimLoadings(alldata, dims = 1:5, reduction = "pca", ncol = 5, balanced = T)
ElbowPlot(alldata, reduction = "pca", ndims = 50)
t-SNE and UMAP are performed on the PCA reduction.
alldata <- RunTSNE(alldata, reduction = "pca", dims = 1:30, perplexity = 30, max_iter = 1000,
theta = 0.5, eta = 200, num_threads = 0)
alldata <- RunUMAP(alldata, reduction = "pca", dims = 1:30, n.components = 2, n.neighbors = 30,
n.epochs = 200, min.dist = 0.3, learning.rate = 1, spread = 1)
plot_grid(ncol = 3, DimPlot(alldata, reduction = "pca", group.by = "orig.ident") +
theme(legend.position = "bottom"),
DimPlot(alldata, reduction = "tsne", group.by = "orig.ident") + NoLegend(),
DimPlot(alldata, reduction = "umap", group.by = "orig.ident") + NoLegend())
The Seurat object with PCA, t-SNE and UMAP reductions is saved under analysis/filtered_data_dr.rds
saveRDS(alldata, file = "analysis/filtered_data_dr.rds")
rm(list=ls())
Top 2000 variable features are identified for each sample separately. Overlap in top variable features between samples is shown as a Venn diagram.
alldata = readRDS("analysis/filtered_data_dr.rds")
alldata.list <- SplitObject(alldata, split.by = "orig.ident")
for (i in 1:length(alldata.list)) {
alldata.list[[i]] <- NormalizeData(alldata.list[[i]], verbose = FALSE)
alldata.list[[i]] <- FindVariableFeatures(alldata.list[[i]], selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
}
hvgs_per_dataset <- lapply(alldata.list, function(x) {
x@assays$RNA@var.features
})
venn::venn(hvgs_per_dataset, opacity = 0.4, zcolor = (scales::hue_pal())(3), cexsn = 1,
cexil = 1, lwd = 1, col = "white", frame = F, borders = NA)
Samples are integrated using Seurat integration functionality with CCA.
alldata.anchors <- FindIntegrationAnchors(object.list = alldata.list, dims = 1:30,
reduction = "cca")
alldata.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:30, new.assay.name = "CCA")
rm(alldata.list, alldata.anchors)
gc(verbose = FALSE)
# Run Dimensionality reduction on integrated space
alldata.int <- ScaleData(alldata.int, verbose = FALSE)
alldata.int <- RunPCA(alldata.int, npcs = 30, verbose = FALSE)
alldata.int <- RunUMAP(alldata.int, dims = 1:30)
alldata.int <- RunTSNE(alldata.int, dims = 1:30)
plot_grid(ncol = 3,
DimPlot(alldata, reduction = "pca", group.by = "orig.ident") + NoAxes() + ggtitle("PCA raw_data"),
DimPlot(alldata, reduction = "tsne", group.by = "orig.ident") + NoAxes() + ggtitle("tSNE raw_data"),
DimPlot(alldata, reduction = "umap", group.by = "orig.ident") + NoAxes() + ggtitle("UMAP raw_data"),
DimPlot(alldata.int, reduction = "pca", group.by = "orig.ident") + NoAxes() + ggtitle("PCA integrated"),
DimPlot(alldata.int, reduction = "tsne", group.by = "orig.ident") + NoAxes() + ggtitle("tSNE integrated"),
DimPlot(alldata.int, reduction = "umap", group.by = "orig.ident") + NoAxes() + ggtitle("UMAP integrated")
)
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3170286 169.4 5324025 284.4 5324025 284.4
## Vcells 487591494 3720.1 1529514480 11669.3 3200132041 24415.1
QC variables are visualized in UMAP of the integrated dataset.
FeaturePlot(alldata.int, reduction = "umap",
features = c("nFeature_RNA", "percent_mito","percent_ribo", "S.Score", "G2M.Score"),
order = T, slot = "data", combine = T)
The Seurat object with CCA integration is saved under analysis/filtered_data_int.rds
saveRDS(alldata.int, file = "analysis/filtered_data_int.rds")
rm(list=ls())
alldata <- readRDS("analysis/filtered_data_int.rds")
Clustering is performed using FindNeighbors on CCA assay and FindClusters using Louvain algorithm at several resolutions. Results are shown for resolutions 0.5, 1, 1.5 and 2. Resolution 0.5 is selected for further analysis.
DefaultAssay(alldata) = "CCA"
alldata <- FindNeighbors(alldata, dims = 1:30, k.param = 200, prune.SNN = 1/15)
# Clustering with louvain (algorithm 1)
for (res in c(0.1, 0.25, 0.5, 1, 1.5, 2)) {
alldata <- FindClusters(alldata, graph.name = "CCA_snn", resolution = res, algorithm = 1)
}
# each time you run clustering, the data is stored in meta data columns:
# seurat_clusters - lastest results only CCA_snn_res.XX - for each different
# resolution you test.
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24191
## Number of edges: 8932737
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9451
## Number of communities: 5
## Elapsed time: 44 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24191
## Number of edges: 8932737
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9036
## Number of communities: 6
## Elapsed time: 37 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24191
## Number of edges: 8932737
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8588
## Number of communities: 9
## Elapsed time: 44 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24191
## Number of edges: 8932737
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7951
## Number of communities: 11
## Elapsed time: 35 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24191
## Number of edges: 8932737
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7428
## Number of communities: 11
## Elapsed time: 33 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 24191
## Number of edges: 8932737
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6946
## Number of communities: 17
## Elapsed time: 28 seconds
plot_grid(ncol = 2, DimPlot(alldata, reduction = "umap", group.by = "CCA_snn_res.0.5", label = TRUE) +
ggtitle("louvain_0.5") + NoLegend(),
DimPlot(alldata, reduction = "umap", group.by = "CCA_snn_res.1", label = TRUE) +
ggtitle("louvain_1") + NoLegend(),
DimPlot(alldata, reduction = "umap", group.by = "CCA_snn_res.1.5", label = TRUE) +
ggtitle("louvain_1.5") + NoLegend(),
DimPlot(alldata, reduction = "umap", group.by = "CCA_snn_res.2", label = TRUE) +
ggtitle("louvain_2") + NoLegend())
clustree(alldata@meta.data, prefix = "CCA_snn_res.")
QC variables are visualized for each cluster as established with the Louvain algorithm at resolution 0.5.
# Set the identity as louvain with resolution 0.5
sel.clust = "CCA_snn_res.0.5"
alldata <- SetIdent(alldata, value = sel.clust)
VlnPlot(alldata, features = c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo",
"S.Score", "G2M.Score"), group.by = "CCA_snn_res.0.5" ,
ncol = 3, pt.size = 0.1)
Number of cells per cluster:
| DMSO | GW | |
|---|---|---|
| 0 | 2543 | 2056 |
| 1 | 1978 | 1998 |
| 2 | 2033 | 1821 |
| 3 | 1848 | 1741 |
| 4 | 1345 | 1238 |
| 5 | 1158 | 1299 |
| 6 | 970 | 926 |
| 7 | 481 | 420 |
| 8 | 170 | 166 |
alldata$ident_clust = paste0(alldata@meta.data[, sel.clust],"_",alldata$orig.ident)
alldata <- ScaleData(alldata, assay = "RNA")
ggplotColours <- function(n = 6, h = c(0, 360) + 15){
if ((diff(h) %% 360) < 1) h[2] <- h[2] - 360/n
hcl(h = (seq(h[1], h[2], length = n)), c = 100, l = 65)
}
color_list <- ggplotColours(n=length(unique(alldata@meta.data[, sel.clust])))
plot_grid(ncol = 3, DimPlot(alldata, label = T) + NoAxes() + theme(legend.position = "none"),
DimPlot(alldata, cells = alldata$orig.ident == "DMSO") + NoAxes() +
labs(title="DMSO") + theme(legend.position = "none", plot.title = element_text(hjust = 0.5)),
DimPlot(alldata, cells = alldata$orig.ident == "GW") + NoAxes() +
labs(title="GW") + theme(legend.position = "none", plot.title = element_text(hjust = 0.5)))
frequencies = as.data.frame(table(data.frame(cluster=alldata@meta.data[, sel.clust],
sample = alldata$orig.ident)))
frequencies$Proportion=NA
for(i in 1:nrow(frequencies)){
frequencies$Proportion[i] = frequencies$Freq[i]/sum(frequencies$Freq[frequencies$sample==frequencies$sample[i]])
}
ggplot(frequencies, aes(fill=sample, y = Proportion, x = cluster)) +
geom_bar(stat = "identity",position="dodge", size = 0.3, color = "black") +
theme(panel.background = element_blank(),axis.ticks.y = element_blank(),
axis.text = element_text(size = 12)) +
scale_fill_manual(values = c("#dfdfde", "#bdc9e2"))
The Seurat object with clustering information is saved under analysis/filtered_data_clus.rds
saveRDS(alldata, file = "analysis/filtered_data_clus.rds")
Markers for each cluster are identified with thresholds logFC > 0.2, p < 0.01, difference in % > 0.2. Top 25 (by p-value) genes are shown in bar graphs and heatmap.
markers_genes <- FindAllMarkers(alldata, logfc.threshold = 0.2, test.use = "wilcox",
min.pct = 0.1, min.diff.pct = 0.2, only.pos = TRUE, max.cells.per.ident = 50,
assay = "RNA", random.seed = 42)
write.csv2(markers_genes, "analysis/diffexpr_clusters_ribo5.csv")
top25 <- markers_genes %>% group_by(cluster) %>% top_n(-25, p_val)
mypar(2, 5, mar = c(4, 6, 3, 1))
for (i in unique(top25$cluster)) {
barplot(sort(setNames(top25$avg_logFC, top25$gene)[top25$cluster == i], F), horiz = T,
las = 1, main = paste0(i, " vs. rest"), border = "white", yaxs = "i", xlab="logFC")
abline(v = c(0, 0.2), lty = c(1, 2))
}
DoHeatmap(alldata, features = as.character(unique(top25$gene)), assay ="RNA",label = FALSE) + labs(color = "Cluster")
Comparisons between clusters are shown for the top 5 genes (by p-value) of each cluster.
top5 <- markers_genes %>% group_by(cluster) %>% top_n(-5, p_val_adj)
DotPlot(alldata, features = rev(as.character(unique(top5$gene))), group.by = sel.clust,
assay = "RNA") + coord_flip()
Top 3 gens (by p-value) for each cluster are visualized with violin plots.
top3 <- top5 %>% group_by(cluster) %>% top_n(-3, p_val)
# set pt.size to zero if you do not want all the points to hide the violin
# shapes, or to a small value like 0.1
VlnPlot(alldata, features = as.character(unique(top3$gene)), ncol = 5, group.by = sel.clust,
assay = "RNA", pt.size = -1)
DMSO and GW3965 conditions are compared for each cluster separately. DE genes are identified with thresholds logFC > 0.2, p < 0.01. Top 25 genes (by p-value) are shown in bar graphs. Top 5 genes (by p-value) are shown in violin plots. All DE genes across clusters are shown as a dot plot and a heatmap. A list of all genes (no thresholds) is also generated and saved to the analysis folder for analysis of specific genes.
DGE_list = list()
for(i in 0:max(as.numeric(as.character(alldata@meta.data[, sel.clust])))){
# select all cells in cluster i
cell_selection <- subset(alldata, cells = colnames(alldata)[alldata@meta.data[, sel.clust] ==
i])
cell_selection <- SetIdent(cell_selection, value = "orig.ident")
# Compute differentiall expression
DGE_cell_selection <- FindAllMarkers(cell_selection, logfc.threshold = 0.2, test.use = "wilcox",
min.pct = 0.1, only.pos = TRUE, max.cells.per.ident = 150,
assay = "RNA", random.seed = 42)
DGE_list[[i+1]] = DGE_cell_selection
}
write.csv2(do.call("rbind", lapply(1:length(DGE_list), function(x){xdf = DGE_list[[x]]; xdf$cluster_no = x-1; xdf})), "analysis/diffexpr_GWvsDMSO_clusters.csv")
for(i in 1:length(unique(alldata@meta.data[, sel.clust]))){
mypar(1, 2, mar = c(4, 6, 3, 1))
thiscluster = DGE_list[[i]]
top25_cell_selection <- DGE_list[[i]] %>% group_by(cluster) %>% top_n(-25, p_val)
for (j in c("DMSO","GW")) {
if(sum(top25_cell_selection$cluster == j)>0){
barplot(sort(setNames(top25_cell_selection$avg_logFC, top25_cell_selection$gene)[top25_cell_selection$cluster == j], F), horiz = T,
las = 1, main = paste("Cluster",i-1,"Up in",j), border = "white", yaxs = "i", xlab = "logFC")
abline(v = c(0, 0.20), lty = c(1, 2))
}
}
top5_cell_selection <- DGE_list[[i]] %>% group_by(cluster) %>% top_n(-5, p_val)
plotlist = VlnPlot(cell_selection, features = as.character(unique(top5_cell_selection$gene)),
ncol = 5, group.by = "orig.ident", assay = "RNA", pt.size = 0.1, combine = FALSE)
plotlist = lapply(1:length(plotlist), function(i){plotlist[[i]] +
labs(title = paste(top5_cell_selection$gene[[i]])) +
theme(legend.position = "none",
plot.title = element_text(size = 12),
axis.title = element_text(size = 10),
axis.text = element_text(size = 10))})
cowplotlist = list(ggdraw() +
draw_label(
paste("Cluster",i-1),
fontface = 'bold',
x = 0,
hjust = 0,
size = 16
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
))
if(sum(top5_cell_selection$cluster=="DMSO")>0){
cowplotlist[[length(cowplotlist)+1]] = ggdraw() +
draw_label(
"Up in DMSO",
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
cowplotlist[[length(cowplotlist)+1]] = cowplot::plot_grid(plotlist = plotlist[top5_cell_selection$cluster=="DMSO"], ncol = 5)
}
if(sum(top5_cell_selection$cluster=="GW")>0){
cowplotlist[[length(cowplotlist)+1]] = ggdraw() +
draw_label(
"Up in GW",
fontface = 'bold',
x = 0,
hjust = 0
) +
theme(
# add margin on the left of the drawing canvas,
# so title is aligned with left edge of first plot
plot.margin = margin(0, 0, 0, 7)
)
cowplotlist[[length(cowplotlist)+1]] = cowplot::plot_grid(plotlist = plotlist[top5_cell_selection$cluster=="GW"], ncol = 5)
}
print(plot_grid(plotlist = cowplotlist,
ncol = 1,
# rel_heights values control vertical title margins
rel_heights = c(0.15, 0.1, 1, 0.1, 1)[1:length(cowplotlist)]
))
}
all_cluster_DGE = do.call("rbind", lapply(1:length(DGE_list), function(x){xdf = DGE_list[[x]]; xdf$cluster_no = x-1; xdf}))
all_cluster_DGE$gene = factor(all_cluster_DGE$gene, levels = unique(sort(all_cluster_DGE$gene)))
cowplot::plot_grid(ggplot(all_cluster_DGE[all_cluster_DGE$cluster=="DMSO",], aes(y = cluster_no, x = factor(gene, levels = sort(unique(gene), decreasing = TRUE)))) + geom_point(aes(size = -log10(p_val), color = avg_logFC)) +
theme(panel.background = element_blank(), panel.grid.minor = element_line(color="grey60"),
axis.text = element_text(size = 10),
axis.ticks.x = element_blank(), legend.key = element_blank(),
legend.position = "bottom",
legend.box = "vertical") +
scale_y_continuous(breaks = min(all_cluster_DGE$cluster_no):max(all_cluster_DGE$cluster_no)) +
scale_color_gradientn(colors = c("#CCCCFF", "#0000AA"), limits = c(0,max(all_cluster_DGE$avg_logFC))) +
coord_flip(clip ="off") +
labs(x = "Gene", y = "Cluster", title = "Up in DMSO") +
scale_size(limits = c(0,max(-log10(all_cluster_DGE$p_val)))),
ggplot(all_cluster_DGE[all_cluster_DGE$cluster=="GW",], aes(y = cluster_no,
x = factor(gene, levels = sort(unique(gene),
decreasing = TRUE)))) +
geom_point(aes(size = -log10(p_val), color = avg_logFC)) +
theme(panel.background = element_blank(), panel.grid.minor = element_line(color="grey60"),
axis.text = element_text(size = 10),
axis.ticks.x = element_blank(), legend.key = element_blank(),
legend.position = "bottom",
legend.box = "vertical") +
scale_y_continuous(breaks = min(all_cluster_DGE$cluster_no):max(all_cluster_DGE$cluster_no)) +
coord_flip(clip ="off") +
labs(x = "Gene", y = "Cluster", title = "Up in GW3965") +
scale_size(limits = c(0,max(-log10(all_cluster_DGE$p_val)))) +
scale_color_gradientn(colors = c("#FFCCCC", "#AA0000"), limits = c(0,max(all_cluster_DGE$avg_logFC))),
ncol = 2, align = "h")
hmgenes = unlist(lapply(DGE_list, function(x){x$gene}))
if(sum(apply(table(hmgenes, unlist(lapply(DGE_list, function(x){x$cluster}))), 1, function(x){sum(x==0)})!=1)>0){
print("Opposite genes!")
}else{
hmgenes = unique(hmgenes)
degenes = do.call("rbind", DGE_list)[hmgenes,]
degenes = degenes[order(degenes$cluster),]
DoHeatmap(alldata, features = degenes$gene, assay = "RNA", label = FALSE, group.by = "ident_clust",
group.colors = rep(color_list,rep(2,9))) +
theme(legend.position = "bottom")
}
DGE_list = list()
for(i in 0:(length(unique(alldata@meta.data[, sel.clust]))-1)){
# select all cells in cluster i
cell_selection <- subset(alldata, cells = colnames(alldata)[alldata@meta.data[, sel.clust] ==
i])
cell_selection <- SetIdent(cell_selection, value = "orig.ident")
# Compute differentiall expression
DGE_cell_selection <- FindAllMarkers(cell_selection, logfc.threshold = -Inf, test.use = "wilcox",
min.pct = -Inf, only.pos = TRUE, max.cells.per.ident = 150, return.thresh = 1,
assay = "RNA", random.seed = 3415142)
DGE_list[[i+1]] = DGE_cell_selection
}
write.csv2(do.call("rbind", lapply(1:length(DGE_list), function(x){xdf = DGE_list[[x]]; xdf$cluster_no = x-1; xdf})), "analysis/diffexpr_GWvsDMSO_clusters_nothreshold.csv")
getsignif = function(ps){
sigs = rep("",length(ps))
sigs[ps<0.05] = "*"
sigs[ps<0.01] = "**"
sigs[ps<0.001] = "***"
sigs[ps<0.0001] = "****"
sigs[ps<0.00001] = "*****"
sigs[ps<0.000001] = "******"
return(sigs)
}
egfgenes = c("Areg","Ereg","Tgfa","Egf","Egfr")
wntgenes = c("Ascl2", "Axin2", "Wnt3", "Wnt9b")
notchgenes = c("Dll1", "Dll4", "Hes1", "Jag1", "Jag2", "Notch1", "Notch2", "Notch3")
alldata@active.assay="RNA"
allLogFCs = read.csv2("analysis/diffexpr_GWvsDMSO_clusters_nothreshold.csv")
allLogFCs$logFC = allLogFCs$avg_logFC*c(-1,1)[(allLogFCs$cluster=="GW")+1]
allLogFCs$Cluster = factor(allLogFCs$cluster_no)
allLogFCs$sig = getsignif(allLogFCs$p_val)
egfplot = (ggplot(allLogFCs[allLogFCs$gene %in% egfgenes,], aes(fill = Cluster, y = logFC, x = gene)) +
geom_bar(stat = "identity", position = "dodge", width = 0.75) +
geom_text(aes(label = sig, #label=signif(p_val, 2),
y = pmax(logFC, 0)), vjust = -1, position = position_dodge(width = 0.75)) +
theme(panel.background = element_blank(), axis.ticks.x = element_blank()))
wntplot = (ggplot(allLogFCs[allLogFCs$gene %in% wntgenes,], aes(fill = Cluster, y = logFC, x = gene)) +
geom_bar(stat = "identity", position = "dodge", width = 0.75) +
geom_text(aes(label = sig, #label=signif(p_val, 2),
y = pmax(logFC, 0)), vjust = -1, position = position_dodge(width = 0.75)) +
theme(panel.background = element_blank(), axis.ticks.x = element_blank()))
notchplot = (ggplot(allLogFCs[allLogFCs$gene %in% notchgenes,],
aes(fill = Cluster, y = logFC, x = gene)) +
geom_bar(stat = "identity", position = "dodge", width = 0.75) +
geom_text(aes(label = sig, #label=signif(p_val, 2),
y = pmax(logFC, 0)), vjust = -1, position = position_dodge(width = 0.75)) +
theme(panel.background = element_blank(), axis.ticks.x = element_blank()))
mymax = max(egfplot$data[,"logFC"],wntplot$data[,"logFC"],notchplot$data[,"logFC"])
mymin = min(egfplot$data[,"logFC"],wntplot$data[,"logFC"],notchplot$data[,"logFC"])
cowplot::plot_grid(
wntplot +
scale_y_continuous(limits = c(mymin,mymax*1.1)) +
theme(axis.text.x = element_text(size =12), axis.title.x = element_blank(), legend.position = "none"),
notchplot +
scale_y_continuous(limits = c(mymin,mymax*1.1)) +
theme(axis.text.x = element_text(size =12), axis.title.x = element_blank(), legend.position = "none"),
egfplot +
scale_y_continuous(limits = c(mymin,mymax*1.1)) +
theme(axis.text.x = element_text(size =12), axis.title.x = element_blank()),
ncol = 3)
pwLogFCs = allLogFCs[allLogFCs$gene %in% c(egfgenes,wntgenes,notchgenes),]
pwLogFCs$gene = factor(pwLogFCs$gene, levels = c(egfgenes,wntgenes,notchgenes))
ggplot(pwLogFCs, aes(x = gene, y = logFC, fill = Cluster)) +
geom_bar(stat ="identity", position="dodge") +
geom_text(aes(label = sig, y = pmax(logFC, 0)), vjust = 0, position = position_dodge(width = 0.9)) +
theme(panel.background = element_blank())
Abca1df = cbind(alldata@assays$RNA@data["Abca1",], alldata@meta.data)
colnames(Abca1df)[1] = "Abca1"
class(Abca1df)
plot_grid(
ggplot(allLogFCs[allLogFCs$gene=="Areg",], aes(x = factor(Cluster,levels=100:0), y = logFC, fill = Cluster)) + geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) +
labs(x = "Cluster", y = "logFC (GW3965/DMSO)", title = "Areg") +
geom_text(aes(label=sig, y = pmax(logFC, 0)), vjust = 0, hjust = 2) +
theme(panel.background = element_blank(), plot.title = element_text(hjust = 0.5), axis.ticks.x = element_blank()) +
coord_flip(),
ggplot(allLogFCs[allLogFCs$gene=="Abca1",], aes(x = factor(Cluster,levels=100:0), y = logFC, fill = Cluster)) +
geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) +
labs(x = "Cluster", y = "logFC (GW3965/DMSO)", title = "Abca1") +
geom_text(aes(label=sig, y = pmax(logFC, 0)), vjust = 0, hjust = 2) +
theme(panel.background = element_blank(), plot.title = element_text(hjust = 0.5), axis.ticks.x = element_blank()) +
coord_flip(),
ggplot(Abca1df , aes(x = Abca1, y = factor(CCA_snn_res.0.5,levels = 100:0), fill = factor(orig.ident, levels = c("GW","DMSO")))) +
stat_summary(geom="bar", position = position_dodge(width = 0.9) ) +
stat_summary(geom = "errorbar", position = position_dodge(width = 0.9), width = 0.4) +
theme(panel.background = element_blank(), plot.title = element_text(hjust = 0.5)) +
labs(y = "Cluster", fill = "", x = "Expression Level", title = "Abca1") +
scale_fill_manual(values = c(DMSO = "#dfdfde", GW = "#bdc9e2")),
ncol = 3)
## [1] "data.frame"
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS/LAPACK: /Users/jenfra/miniconda3/envs/Das_RNASeq/lib/libopenblasp-r0.3.10.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] KernSmooth_2.23-20 fields_12.3 viridis_0.6.1
## [4] viridisLite_0.4.0 spam_2.6-0 dotCall64_1.0-1
## [7] DoubletFinder_2.0.3 dplyr_1.0.6 venn_1.10
## [10] clustree_0.4.3 ggraph_2.0.5 rafalib_1.0.0
## [13] pheatmap_1.0.12 cowplot_1.1.1 ggplot2_3.3.3
## [16] Matrix_1.3-2 Seurat_3.1.2
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 Rtsne_0.15 colorspace_2.0-1
## [4] ellipsis_0.3.2 ggridges_0.5.3 farver_2.1.0
## [7] leiden_0.3.8 listenv_0.8.0 graphlayouts_0.7.1
## [10] ggrepel_0.9.1 fansi_0.5.0 mvtnorm_1.1-1
## [13] mathjaxr_1.4-0 codetools_0.2-18 splines_3.6.3
## [16] R.methodsS3_1.8.1 mnormt_2.0.2 knitr_1.33
## [19] TFisher_0.2.0 polyclip_1.10-0 jsonlite_1.7.2
## [22] ica_1.0-2 cluster_2.1.2 png_0.1-7
## [25] R.oo_1.24.0 uwot_0.1.10 ggforce_0.3.3
## [28] sctransform_0.3.2 compiler_3.6.3 httr_1.4.2
## [31] assertthat_0.2.1 lazyeval_0.2.2 tweenr_1.0.2
## [34] admisc_0.15 htmltools_0.5.1.1 tools_3.6.3
## [37] rsvd_1.0.3 igraph_1.2.6 gtable_0.3.0
## [40] glue_1.4.2 RANN_2.6.1 reshape2_1.4.4
## [43] maps_3.3.0 Rcpp_1.0.6 Biobase_2.46.0
## [46] jquerylib_0.1.4 vctrs_0.3.8 multtest_2.42.0
## [49] ape_5.5 nlme_3.1-152 lmtest_0.9-38
## [52] xfun_0.23 stringr_1.4.0 globals_0.14.0
## [55] rbibutils_2.1.1 lifecycle_1.0.0 irlba_2.3.3
## [58] future_1.21.0 MASS_7.3-54 zoo_1.8-9
## [61] scales_1.1.1 tidygraph_1.2.0 parallel_3.6.3
## [64] sandwich_3.0-1 RColorBrewer_1.1-2 yaml_2.2.1
## [67] reticulate_1.20 pbapply_1.4-3 gridExtra_2.3
## [70] sass_0.4.0 stringi_1.6.2 highr_0.9
## [73] mutoss_0.1-12 plotrix_3.8-1 BiocGenerics_0.32.0
## [76] Rdpack_2.1.2 SDMTools_1.1-221.2 rlang_0.4.11
## [79] pkgconfig_2.0.3 matrixStats_0.59.0 evaluate_0.14
## [82] lattice_0.20-44 ROCR_1.0-11 purrr_0.3.4
## [85] htmlwidgets_1.5.3 tidyselect_1.1.1 parallelly_1.25.0
## [88] RcppAnnoy_0.0.18 plyr_1.8.6 magrittr_2.0.1
## [91] bookdown_0.22 R6_2.5.0 generics_0.1.0
## [94] multcomp_1.4-17 DBI_1.1.1 withr_2.4.2
## [97] pillar_1.6.1 sn_2.0.0 fitdistrplus_1.1-5
## [100] survival_3.2-11 tsne_0.1-3 tibble_3.1.2
## [103] future.apply_1.7.0 crayon_1.4.1 utf8_1.2.1
## [106] tmvnsim_1.0-2 plotly_4.9.3 rmarkdown_2.8
## [109] data.table_1.14.0 metap_1.4 digest_0.6.27
## [112] tidyr_1.1.3 numDeriv_2016.8-1.1 R.utils_2.10.1
## [115] stats4_3.6.3 munsell_0.5.0 bslib_0.2.5.1